(Partially abridged from r.statistics.co)

library(tidyverse)

Top ggplot2 Visualizations (continued)

Recap

There are 8 main types of visualizations, depending on the primary objective: Correlation, Deviation, Ranking, Distribution, Composition, Change, Groups and Spatial.

In the previous notebook, we focused on Correlation, Deviation, Ranking, Distribution, and Composition.

Change

Time Series Plot From a Time Series Object (ts)

The ggfortify package allows autoplot() to automatically plot directly from a time series object (ts)

library(ggfortify)
# load data
data("AirPassengers")
# check they are a ts object
class(AirPassengers)
#> [1] "ts"
# more info with dplyr::glimpse()
glimpse(AirPassengers)
#>  Time-Series [1:144] from 1949 to 1961: 112 118 132 129 121 135 148 148 136 119 ...
# take a deeper look
AirPassengers
#>      Jan Feb Mar Apr May Jun Jul Aug Sep Oct Nov Dec
#> 1949 112 118 132 129 121 135 148 148 136 119 104 118
#> 1950 115 126 141 135 125 149 170 170 158 133 114 140
#> 1951 145 150 178 163 172 178 199 199 184 162 146 166
#> 1952 171 180 193 181 183 218 230 242 209 191 172 194
#> 1953 196 196 236 235 229 243 264 272 237 211 180 201
#> 1954 204 188 235 227 234 264 302 293 259 229 203 229
#> 1955 242 233 267 269 270 315 364 347 312 274 237 278
#> 1956 284 277 317 313 318 374 413 405 355 306 271 306
#> 1957 315 301 356 348 355 422 465 467 404 347 305 336
#> 1958 340 318 362 348 363 435 491 505 404 359 310 337
#> 1959 360 342 406 396 420 472 548 559 463 407 362 405
#> 1960 417 391 419 461 472 535 622 606 508 461 390 432
theme_set(theme_classic())

autoplot(AirPassengers) + labs(title = "AirPassengers") + theme(plot.title = element_text(hjust = 0.5))

Time Series Plot From a Data Frame

Using geom_line(), a time series (or line chart) can be drawn from a regular data frame as well. The X axis breaks are generated by default. In the example below, we use the economics data again: the breaks are drawn once every 10 years.

data(economics)

# (re)compute %Returns
economics$returns_perc <- c(0, diff(economics$psavert)/economics$psavert[-length(economics$psavert)])

theme_set(theme_classic())

# Allow Default X Axis Labels
ggplot(economics, aes(x = date)) + geom_line(aes(y = returns_perc)) +
    labs(title = "Time Series Chart", subtitle = "Returns Percentage from 'Economics' Dataset",
        caption = "Source: Economics", y = "Returns %")

Time Series Plot For a Monthly Time Series

If you want to set your own time intervals (breaks) in X axis, you need to set the breaks and labels using scale_x_date().

library(lubridate)
theme_set(theme_bw())

# consider a 24-month timeframe
economics_m <- economics[1:24, ]
# labels and breaks for X axis text
lbls <- paste0(month.abb[month(economics_m$date)], " ", lubridate::year(economics_m$date)) # month.abb is a built-in constant
brks <- economics_m$date

head(brks)
#> [1] "1967-07-01" "1967-08-01" "1967-09-01" "1967-10-01" "1967-11-01"
#> [6] "1967-12-01"
head(lbls)
#> [1] "Jul 1967" "Aug 1967" "Sep 1967" "Oct 1967" "Nov 1967" "Dec 1967"

# plot
ggplot(economics_m, aes(x=date)) + 
    geom_line(aes(y=returns_perc)) + 
    labs(title="Monthly Time Series", 
         subtitle="Returns Percentage from Economics Dataset", 
         caption="Source: Economics", 
         y="Returns %") +  # title and caption
    scale_x_date(labels = lbls, 
                 breaks = brks) +  # change to monthly ticks and labels
    theme(axis.text.x = element_text(angle = 90, vjust=0.5),  # rotate x axis text
          panel.grid.minor = element_blank())  # turn off minor grid

Time Series Plot For a Yearly Time Series

theme_set(theme_bw())

# 7.5 years:
economics_y <- economics[1:90, ]

# labels and breaks for X axis text
brks <- economics_y$date[seq(1, length(economics_y$date), 12)] # one break at each year
lbls <- lubridate::year(brks)

# plot
ggplot(economics_y, aes(x=date)) + 
    geom_line(aes(y=returns_perc)) + 
    labs(title="Yearly Time Series", 
         subtitle="Returns Percentage from Economics Dataset", 
         caption="Source: Economics", 
         y="Returns %") +  # title and caption
    scale_x_date(labels = lbls, 
                 breaks = brks) +  # change to monthly ticks and labels
    theme(axis.text.x = element_text(angle = 90, vjust=0.5),  # rotate x axis text
          panel.grid.minor = element_blank())  # turn off minor grid

Time Series Plot From Long Data Format: Multiple Time Series in Same Dataframe Column

In this example, we construct a ggplot from a long data format. That means, the column names and respective values of all the columns are stacked in just 2 variables (variable and value respectively). If you were to convert this data to wide format, it would look like the economics dataset.

In below example, the geom_line is drawn for the value column and the aes(col) is set to the variable. This way, with just one call to geom_line, multiple colored lines are drawn, each for each unique value in the variable column. The scale_x_date() changes the X axis breaks and labels, and scale_color_manual changes the color of the lines.

data(economics_long, package = "ggplot2")
head(economics_long)

theme_set(theme_bw())
# filter & restrict to specific year range
dataf <- economics_long %>% dplyr::filter(variable %in% c("psavert", "uempmed"),
                                          lubridate::year(date) %in% c(1967:1981))

table(dataf$variable)
#> 
#> psavert uempmed 
#>     174     174

# labels and breaks for X axis text
brks <- dataf$date[seq(1, length(dataf$date), 12)] # one break at each year
lbls <- lubridate::year(brks)

# plot
ggplot(dataf, aes(x=date)) + 
    geom_line(aes(y=value, col=variable)) + 
    labs(title="Time Series of Returns Percentage", 
         subtitle="Drawn from Long Data format", 
         caption="Source: Economics", 
         color=NULL) +  # title and caption
    scale_x_date(labels = lbls, breaks = brks) +  # change to monthly ticks and labels
    scale_color_manual(labels = c("psavert", "uempmed"), 
                       values = c("psavert"="#00ba38", "uempmed"="#f8766d")) +  # line color
    theme(axis.text.x = element_text(angle = 90, vjust=0.5, size = 8),  # rotate x axis text
          panel.grid.minor = element_blank())  # turn off minor grid

Time Series Plot From Wide Data Format: Data in Multiple Columns of Dataframe

As previously noted in this tutorial, whenever your plot’s geom (points, lines, bars, etc) changes the fill, size, col, shape or stroke based on another column, a legend is automatically drawn.

If you are creating a time series (or even other types of plots) from a wide data format, you have to draw each line manually by calling geom_line() once for every line. To customize the legend, you can use the scale_aesthetic_manual() format of functions (like, scale_color_manual() if only the color of your lines changes). Using this function, you can give a legend title with the name argument, tell what color the legend should take with the values argument and also set the legend labels.

Even though the below plot looks exactly like the previous one, the approach to construct it is a little different.

theme_set(theme_bw())

dataf <- economics %>% dplyr::select(date, psavert, uempmed) %>% 
    dplyr::filter(lubridate::year(date) %in% c(1967:1981))
head(dataf)

# labels and breaks for X axis text
brks <- dataf$date[seq(1, length(dataf$date), 12)]
lbls <- lubridate::year(brks)

# plot
ggplot(dataf, aes(x=date)) + 
  geom_line(aes(y=psavert, col="psavert")) + # 1st line
  geom_line(aes(y=uempmed, col="uempmed")) + # 2nd line
  labs(title="Time Series of Returns Percentage", 
       subtitle="Drawn From Wide Data format", 
       caption="Source: Economics", y="value") +  # title and caption
  scale_x_date(labels = lbls, breaks = brks) +  # change to monthly ticks and labels
  scale_color_manual(name="",
                     values = c("psavert"="#00ba38", "uempmed"="#f8766d")) +  # line color
  theme(axis.text.x = element_text(angle = 90, vjust=0.5, size = 8),
        panel.grid.minor = element_blank())  # turn off minor grid

Stacked Area Chart

A stacked area chart is just like a line chart, except that the region below the line is filled with a color. This is typically used when:

  1. You want to describe how a quantity or volume (rather than something like price) changed over time
  2. You have many data points. For very few data points, a bar chart is a better solution.
  3. You want to show the contribution from individual components.

This can be plotted using geom_area, which works very much like geom_line. But there is an important point to note. By default, each geom_area starts from the bottom of Y axis (which is typically 0), but, if you want to show the contribution from individual components, you want the geom_area to be stacked over the top of previous component, rather than the floor of the plot itself. So, you have to add all the bottom layers while setting the y in geom_area().

In below example, I have set it as y=psavert+uempmed for the topmost geom_area.

However nice the plot looks, the caveat is that it can easily become complicated and uninterpretable if there are too many components.

theme_set(theme_bw())

dataf <- economics %>% dplyr::select(date, psavert, uempmed) %>% 
    dplyr::filter(lubridate::year(date) %in% c(1967:1981))

# labels and breaks for X axis text
brks <- dataf$date[seq(1, length(dataf$date), 12)]
lbls <- lubridate::year(brks)

# plot
ggplot(dataf, aes(x=date)) + 
    geom_area(aes(y=psavert+uempmed, fill="psavert")) + # 1st "layer"
    geom_area(aes(y=uempmed, fill="uempmed")) + # 2nd "layer" (plotted over the 1st)
    labs(title="Area Chart of Returns Percentage", 
         subtitle="From Wide Data format", 
         caption="Source: Economics", y="value") +  # title and caption
    scale_x_date(labels = lbls, breaks = brks) +  # change to monthly ticks and labels
    scale_fill_manual(name="", 
                      values = c("psavert"="#00ba38", "uempmed"="#f8766d")) +  # line color
    theme(panel.grid.minor = element_blank())  # turn off minor grid

Calendar Heatmap

When you want to see the variation, especially the highs and lows, of a metric like a stock price on an actual calendar, the calendar heat map is a great tool. It emphasizes the variation visually over time rather than the actual value itself.

This can be implemented using geom_tile. However, getting it in the right format has more to do with the data preparation rather than the plotting itself.

library(plyr)
library(scales)
library(zoo)

dataf <- read.csv("https://raw.githubusercontent.com/selva86/datasets/master/yahoo.csv")  # Yahoo! stock closing price 2007-2016
dataf$date <- as.Date(dataf$date)  # format date
dataf <- dataf[dataf$year >= 2012, ]  # filter years

# Create Month Week
dataf$yearmonth <- as.yearmon(dataf$date)
dataf$yearmonthf <- factor(dataf$yearmonth)
dataf <- ddply(dataf, .(yearmonthf), transform, monthweek = 1 +
    week - min(week))  # compute week number of month
dataf <- dataf[, c("year", "yearmonthf", "monthf", "week", "monthweek",
    "weekdayf", "VIX.Close")]
head(dataf)

ggplot(dataf, aes(monthweek, weekdayf, fill = VIX.Close)) + geom_tile(colour = "white") +
    facet_grid(year ~ monthf) + scale_fill_gradient(low = "red",
    high = "green") + labs(x = "Week of Month", y = "", title = "Time-Series Calendar Heatmap",
    subtitle = "Yahoo Closing Price", fill = "Close")

Slope Chart

Slope chart is a great tool of you want to visualize change in value and ranking between categories. This is more suitable over a time series when there are very few time points.

theme_set(theme_classic())
source_df <- read.csv("https://raw.githubusercontent.com/jkeirstead/r-slopegraph/master/cancer_survival_rates.csv")  # Estimates of % survival rates for different tumors

# Define functions. Source:
# https://github.com/jkeirstead/r-slopegraph Calculates
# slope graph positions based on Edward Tufte's layout
tufte_sort <- function(df, x = "year", y = "value", group = "group",
    min.space = 0.05) {
    ## First rename the columns for consistency
    ids <- match(c(x, y, group), names(df))
    df <- df[, ids]
    names(df) <- c("x", "y", "group")

    ## Expand grid to ensure every combination has a
    ## defined value
    tmp <- expand.grid(x = unique(df$x), group = unique(df$group))
    tmp <- merge(df, tmp, all.y = TRUE)
    df <- mutate(tmp, y = ifelse(is.na(y), 0, y))

    ## Cast into a matrix shape and arrange by first column
    require(reshape2)
    tmp <- dcast(df, group ~ x, value.var = "y")
    ord <- order(tmp[, 2])
    tmp <- tmp[ord, ]

    min.space <- min.space * diff(range(tmp[, -1]))
    yshift <- numeric(nrow(tmp))
    ## Start at 'bottom' row Repeat for rest of the rows
    ## until you hit the top
    for (i in 2:nrow(tmp)) {
        ## Shift subsequent row up by equal space so gap
        ## between two entries is >= minimum
        mat <- as.matrix(tmp[(i - 1):i, -1])
        d.min <- min(diff(mat))
        yshift[i] <- ifelse(d.min < min.space, min.space - d.min,
            0)
    }

    tmp <- cbind(tmp, yshift = cumsum(yshift))

    scale <- 1
    tmp <- melt(tmp, id = c("group", "yshift"), variable.name = "x",
        value.name = "y")
    ## Store these gaps in a separate variable so that they
    ## can be scaled ypos = a*yshift + y

    tmp <- transform(tmp, ypos = y + scale * yshift)
    return(tmp)

}

plot_slopegraph <- function(df) {
    ylabs <- subset(df, x == head(x, 1))$group
    yvals <- subset(df, x == head(x, 1))$ypos
    fontSize <- 3
    gg <- ggplot(df, aes(x = x, y = ypos)) + geom_line(aes(group = group),
        colour = "grey80") + geom_point(colour = "white", size = 8) +
        geom_text(aes(label = y), size = fontSize, family = "American Typewriter") +
        scale_y_continuous(name = "", breaks = yvals, labels = ylabs)
    return(gg)
}

## Prepare data
dataf <- tufte_sort(source_df, x = "year", y = "value", group = "group",
    min.space = 0.05)

dataf <- transform(dataf, x = factor(x, levels = c(5, 10, 15,
    20), labels = c("5 years", "10 years", "15 years", "20 years")),
    y = round(y))

## Plot
plot_slopegraph(dataf) + labs(title = "Estimates of % survival rates") +
    theme(axis.title = element_blank(), axis.ticks = element_blank(),
        plot.title = element_text(hjust = 0.5, family = "American Typewriter",
            face = "bold"), axis.text = element_text(family = "American Typewriter",
            face = "bold"))

Seasonal Plot

If you are working with a time series object of class ts or xts, you can view the seasonal fluctuations through a seasonal plot drawn using forecast::ggseasonplot. Below is an example using the native AirPassengers and nottem time series.

You can see the traffic increase in air passengers over the years along with the repetitive seasonal patterns in traffic.

library(forecast)
theme_set(theme_classic())

# Subset data
nottem_small <- window(nottem, start = c(1920, 1), end = c(1925,
    12))  # subset a smaller timewindow

# Plot
ggseasonplot(AirPassengers) + labs(title = "Seasonal plot: International Airline Passengers")

The temperatures at Nottingham castle do not show an increase over the years, but they definitely follow a seasonal pattern:

ggseasonplot(nottem_small) + labs(title = "Seasonal plot: Air temperatures at Nottingham Castle")

Groups

Hierarchical Dendrogram

First we compute a hierarchical clustering with hclust(), then we plot the computed dendrogram with the dedicated function ggdendrogram(), part of the ggdendro library.

We see an example on the classic dataset USArrests, consisting of the number of arrests per 100K residents for 3 different crimes in each of the 50 USA states in 1973.

head(USArrests)

# install.packages(ggdendro)
library(ggdendro)
theme_set(theme_bw())

hc <- hclust(dist(USArrests), method = "average")  # hierarchical clustering

ggdendrogram(hc, rotate = TRUE, size = 2)

Clusters

It is possible to show the distinct clusters or groups using geom_encircle(). We already saw an example in this lab series, with a scatterplot. If the dataset has multiple weak features, you can compute the principal components and draw a scatterplot using PC1 and PC2 as X and Y axis.

The geom_encircle() can be used to encircle the desired groups. The only thing to note is the data argument to geom_encircle(): you need to provide a subsetted dataframe that contains only the observations (rows) that belong to the group you want to be encircled - exactly what we previously did in the scatterplot example.

# load/reload libraries as needed
library(ggalt)
library(ggfortify)
theme_set(theme_classic())

# we'll use the Iris dataset
head(iris)
# filter out the Species column
dataf <- iris %>% dplyr::select(-Species)
# compute the principal components
pca_mod <- prcomp(dataf)
# convert to dataframe & add back the Species column
df_pc <- data.frame(pca_mod$x, Species=iris$Species)
# create the subsetted dataframes to be encircled in the plot
df_pc_vir <- df_pc %>% dplyr::filter(Species == "virginica") # df for 'virginica'
df_pc_set <- df_pc %>% dplyr::filter(Species == "setosa")  # df for 'setosa'
df_pc_ver <- df_pc %>% dplyr::filter(Species == "versicolor")  # df for 'versicolor'
 
ggplot(df_pc, aes(PC1, PC2, col=Species)) + # base call
    geom_point(aes(shape=Species), size=2) + # add points
    labs(title="Iris Clusters", 
         subtitle="With principal components PC1 and PC2 as X and Y axis",
         caption="Source: Iris") + 
    coord_cartesian(xlim = 1.2 * c(min(df_pc$PC1), max(df_pc$PC1)), 
                    ylim = 1.2 * c(min(df_pc$PC2), max(df_pc$PC2))) + # change axis limits (without deleting points)
    geom_encircle(data = df_pc_vir, aes(x=PC1, y=PC2)) +   # draw circles
    geom_encircle(data = df_pc_set, aes(x=PC1, y=PC2)) + 
    geom_encircle(data = df_pc_ver, aes(x=PC1, y=PC2))

Interactivity

Using the library plotly:

library(plotly)
data <- data.frame(cond = rep(c("condition_1", "condition_2"),
    each = 10), my_x = 1:100 + rnorm(100, sd = 9), my_y = 1:100 +
    rnorm(100, sd = 16))

my_graph <- ggplot(data, aes(x = my_x, y = my_y)) + geom_point(shape = 1)

# Let's make it interactive using the ggplotly function !
p <- ggplotly(my_graph)
p

Animation

Animation in ggplot2 can be obtained through the gganimate package, which has been recently restructured, thus it is not unlikely that some older code might not work with the current version 1.0.7.

In what follows we present a couple of examples of animation, from How to create animations in R with gganimate and Visualizing 75 Years of Measles Incidence Using Gganimate.

One of the key aspects of a good animation is that the user knows why the graph is moving, that is, that the user knows what each moment on the animation means.

To do so, the gganimate functions include a set of transition functions that interpret the plot data to distribute them over a number of frames. The transition functions make the following variables (a.k.a. “frame variables”) available for string literal interpretation, meaning that you can allow your labels to include data from the animation:

Name of the function Labs variable
transition_components frame_time
transition_events frame_time
transition_filter previous_filter, closest_filter, next_filter
transition_layer previous_layer, closest_layer, next_layer, nlayers
transition_manual previous_frame, current_frame, next_frame
transition_reveal frame_along
transition_states previous_state, closest_state, next_state
transition_time frame_time

A first example

Let’s prepare an example using the gapminder dataset from the same package.

library(gapminder)
data <- gapminder
head(data)

In order to create our animation, we will begin by creating a static ggplot graph. If a lot of data overlays on a single state (in our case the year) the graph won’t be very visual. So, we will do it just for one country so that it is much more visual.

my_graph <- data %>%
    ggplot(aes(x = gdpPercap, y = lifeExp, col = continent, size = pop)) +
    geom_point(alpha = 0.8) + theme_minimal() + theme(legend.position = "bottom") +
    guides(size = "none") + labs(x = "GDP per Capita", y = "Life Expectancy",
    col = "")

show(my_graph)  # same as plot(my_graph) or print(my_graph)

Now we can create the animation simply by passing the transition function and gganimate will create the animation. Yes, with just one function we can create an animation! Let’s see it:

library(gganimate)

p <- my_graph + transition_time(year)
animate(p, width = 700, height = 450, renderer = ffmpeg_renderer(format = "webm"))

While you can create an animation with only one function, there are a lot of settings to adjust to make the animation look awesome: we will now learn how to improve that animation.

We’ll start by adding a dynamic title using the frame variable frame_time:

p <- my_graph + transition_time(year) + labs(title = "Year: {frame_time}")

animate(p, width = 700, height = 450, renderer = ffmpeg_renderer(format = "webm"))

As you can see, the date is now included in the title, but:

  • it is small
  • it is out of the graph
  • it prevents using the title for telling other things.

It is much more visual and impactful to include the varying data on the same graph through an extra ggplot layer: for example, a textual annotation (geom_text). In this way, we can use the title for whatever we want, we can give the transition state the look and feel that we want, and it’s all within the graph.

Let’s see an example:

p <- my_graph + geom_text(aes(x = min(gdpPercap), y = min(lifeExp),
    label = as.factor(year)), hjust = -2, vjust = -0.2, alpha = 0.2,
    col = "gray", size = 20) + transition_states(as.factor(year),
    state_length = 0)

animate(p, width = 700, height = 450, renderer = ffmpeg_renderer(format = "webm"))

As you can see, in this case, I have used the function transition_states instead of transition_time and I have also changed the year variable to a factor. The reason is that the transition functions interpolate numeric data, which makes it look terribly bad. When we convert the number into a factor the problem disappears.

Besides, I have also included the parameter state_length=0. This parameter allows us to control for how long the graph will pause before changing to the new state. In my case, I will set as zero, because with higher values the transition wouldn’t be smooth.

Note that the scales of the animation do not change during the animation. This generates two things:

  1. The growth is not that well represented. By having the final value visible from the beginning, we do not have as much growth prospects.
  2. If the scale is very wide and the initial values are low (for example in aggregated data), we lose perspective of what happens at the beginning, when the values are low.

If we want to avoid this and to better see how the variables grow, it is better to adjust the scale in each frame. For this, we will use the view_follow() function.

A very clear impact of this issue is the impact on the evolution graphs. Let’s see an example of the evolution of the Spanish GDP.

# static version
p_static <- data %>%
    filter(country == "Spain") %>%
    ggplot(aes(year, pop)) + geom_point() + geom_line() + theme_minimal()

print(p_static)


# animation
p <- p_static + transition_reveal(year)

animate(p, width = 700, height = 450, renderer = ffmpeg_renderer(format = "webm"))

As you can see, since we do not change the scales, the plot does not look that alive and it’s not so impactful. However, if we make the axis scales change automatically, the speed of the change will make us better see how the GPD per Capita has evolved.

p_static <- data %>%
    filter(country == "Spain") %>%
    ggplot(aes(year, pop)) + geom_point() + geom_line() + geom_text(aes(x = min(year),
    y = min(pop), label = as.factor(year)), hjust = -2, vjust = -0.2,
    alpha = 0.5, col = "gray", size = 20) + theme_minimal()

print(p_static)  # in the static plot we'll see the text annotations stacked above each other


p <- p_static + transition_reveal(year) + view_follow()  # change the axis scales automatically

animate(p, width = 700, height = 450, renderer = ffmpeg_renderer(format = "webm"))

As you can see, these two tricks have significantly improved our animation. But there is still a very important thing to learn: the animation renderization.

Rendering

To render is to convert our R commands into an animation. At this step we can personalize a lot of the key elements of our animations, such as:

  • The width and height of the animation to create an animation that it is correctly seen on the device that we want to see it.
  • Duration, number of frames, and number of frames per second (fps): this will make you the animation see fluently.
  • Output file format: if you don’t want to create a gif, you can also create a video too.

To render our animation, first we need to save the result of the animation as an object. By doing so, we can now pass this object to the animate function. This function has many other parameters with which we can adjust the things we have commented on previously:

  • fps: the human being is able to distinguish between 10 and 12 frames per second. If we add more frames, the brain does not see many images together, but rather it sees an animation. Thus, the fps parameter should always be higher than 12. I would recommend setting it at 25fps, as it balances between fluentness and lightness.
  • duration: it sets for how long should the animation long. This will depend on the number of states that we have. In my opinion, for transitions with a lot of states, I would set the state to last for 0.5 seconds. Anyway, there is not a global option and I would recommend trying several durations.

Example: bar chart race

Now we will apply everything that we have learned on how to create animations in R with gganimate to create an awesome animation: a bar chart race.

To create our bar chart race we will analyze the evolution of the countries with the highest GDP per capita on the gapminder dataset. To do so, first we need to get the ranking of the countries on each year. This is something that we can easily do with dplyr:

data2 <- data %>%
    group_by(year) %>%
    arrange(year, desc(gdpPercap)) %>%
    mutate(ranking = row_number()) %>%
    filter(ranking <= 15)
head(data2)

After that, we can create the bar chart race animation with gganimate. In this case, we will include the functions enter_fade and exit_fade, which will create a fade off effect when the countries appear or disappear. Besides, we will use the function ease_aes to create a non-linear animation that looks better:

static_gdp <- data2 %>%
    ggplot(aes(ranking, gdpPercap, fill = country)) +
    geom_col() + # geom_col is like geom_bar but it uses a identity stat instead of count
    geom_text(aes(ranking, gdpPercap, label = gdpPercap), hjust=-0.1) +
    geom_text(aes(ranking, y=0 , label = country), hjust=1.1) + 
    geom_text(aes(x=15, y=max(gdpPercap) , label = as.factor(year)), vjust = 0.2, alpha = 0.5,  col = "gray", size = 20) +
    coord_flip(clip = "off", expand = FALSE) + scale_x_reverse() +
    theme_minimal() + theme(
        panel.grid = element_blank(), 
        legend.position = "none",
        axis.ticks.y = element_blank(),
        axis.title.y = element_blank(),
        axis.text.y = element_blank(),
        plot.margin = margin(1, 4, 1, 3, "cm")
    )
plot(static_gdp)


my_animation <- static_gdp +
    transition_states(year, state_length=0, transition_length=2) +
    enter_fade() +
    exit_fade() + 
    ease_aes('quadratic-in-out') 

animate(my_animation, width=700, height=432, fps=25, duration=15, rewind=FALSE, renderer=ffmpeg_renderer(format = "webm"))

Animation with maps: 75 years of measles incidence

# load/reload libraries
library(tidyverse)
library(gganimate)
library(grid)
library(sf)
library(usmap)
library(scales)
library(Hmisc)
diseases <- read.csv(file = "https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2019/2019-12-10/diseases.csv")
diseases %>%
    dim()
#> [1] 18870     6
diseases %>%
    colnames()
#> [1] "disease"         "state"           "year"            "weeks_reporting"
#> [5] "count"           "population"
head(diseases)

diseases %>%
    summary()
#>    disease             state                year      weeks_reporting
#>  Length:18870       Length:18870       Min.   :1928   Min.   : 0.00  
#>  Class :character   Class :character   1st Qu.:1956   1st Qu.:14.00  
#>  Mode  :character   Mode  :character   Median :1977   Median :44.00  
#>                                        Mean   :1974   Mean   :33.28  
#>                                        3rd Qu.:1992   3rd Qu.:50.00  
#>                                        Max.   :2011   Max.   :52.00  
#>                                                                      
#>      count            population      
#>  Min.   :     0.0   Min.   :   86853  
#>  1st Qu.:     1.0   1st Qu.: 1046542  
#>  Median :    47.0   Median : 2824918  
#>  Mean   :  1367.5   Mean   : 4242911  
#>  3rd Qu.:   440.8   3rd Qu.: 5153640  
#>  Max.   :132342.0   Max.   :37607525  
#>                     NA's   :204
unique(diseases$disease)
#> [1] "Hepatitis A" "Measles"     "Mumps"       "Pertussis"   "Polio"      
#> [6] "Rubella"     "Smallpox"
diseases %>%
    group_by(disease) %>%
    summarise(sum(count))

I summarized the cases of each disease by counts to get an idea of which one had the most cases in all of US over the span of the ~75 years in the dataset. As anticipated, measles had the greatest number, with >18M cases in the United States from 1928 to 2003.

To compare the number of cases between states, I mutated the dataset to add an incidence rate that takes into account the case count over state population by 100,000 people for each state per year.

diseases <- diseases %>%
    mutate(incidence = (count/population * 1e+05 * (ifelse(weeks_reporting ==
        0, 0, 52/weeks_reporting))))

Because measles had the most cases of all diseases in the dataset, I decided to filter the dataset and explore measles for my visualization

diseases %>%
    filter(disease == "Measles") %>%
    summary()
#>    disease             state                year      weeks_reporting
#>  Length:3876        Length:3876        Min.   :1928   Min.   : 0.00  
#>  Class :character   Class :character   1st Qu.:1947   1st Qu.:32.00  
#>  Mode  :character   Mode  :character   Median :1966   Median :47.00  
#>                                        Mean   :1966   Mean   :37.45  
#>                                        3rd Qu.:1984   3rd Qu.:50.00  
#>                                        Max.   :2003   Max.   :52.00  
#>                                                                      
#>      count          population         incidence       
#>  Min.   :     0   Min.   :   86853   Min.   :   0.000  
#>  1st Qu.:     8   1st Qu.:  966309   1st Qu.:   0.668  
#>  Median :   543   Median : 2590843   Median :  32.108  
#>  Mean   :  4817   Mean   : 3858493   Mean   : 179.939  
#>  3rd Qu.:  3986   3rd Qu.: 4696902   3rd Qu.: 244.762  
#>  Max.   :132342   Max.   :34861711   Max.   :3670.243  
#>                   NA's   :64         NA's   :64

measles <- diseases %>%
    filter(disease == "Measles") %>%
    mutate(incidence_rounded = round(incidence, digits = 1))
head(measles)

The usmaps package has a fips() function that outputs the FIPS code (Federal Information Processing Standards) for a state name string input. In essence, the FIPS cose is a number that uniquely identifies a geographic area. To have the state level data plotted onto the map, a FIPS column corresponding to each state needs to be created.

measles$fips <- fips(measles$state)

There is another important piece of information to account for in the visualization to make sense of the changes in trends: vaccine introduction. The measles vaccine was introduced in 1963 in the US, with a second dose recommended by 1989. To show this on the map, I created a factor variable that maps each year to three phases of vaccine availability: no vaccine (before 1963), vaccine introduction (1963-1989), and the years following the recommendation of the second dose (after 1989). This column will be used as a subtitle on the map to demonstrate the trends relative to vaccine availability over the course of the years.

measles$vaccine <- ifelse((measles$year >= 1963) & (measles$year <
    1989), 2, ifelse((measles$year >= 1989), 3, ifelse((measles$year <
    1963), 1, 1)))
measles$vaccine <- recode(measles$vaccine, `1` = "", `2` = "Vaccine introduced",
    `3` = "Second dose recommended")

If you take a look at the summary stats for the incidence in the measles data, you can see how variability between the rates would unevenly polarize rates above and below the third quartile, which would obscure interpretation of the rates on a gradient scale.

measles %>%
    select(incidence) %>%
    summary()
#>    incidence       
#>  Min.   :   0.000  
#>  1st Qu.:   0.668  
#>  Median :  32.108  
#>  Mean   : 179.939  
#>  3rd Qu.: 244.762  
#>  Max.   :3670.243  
#>  NA's   :64

Instead of using the default scale in scale_color_gradient(), which creates breaks by evenly spacing the intervals, quartiles are a better quantification of breaks in the dataset. A breaks vector is created to be read into the breaks argument of the scale_color_gradient() function in the plot.

# compute the incidence quartiles
breaks <- quantile(measles$incidence_rounded, probs = seq(0,
    1, 0.25), na.rm = TRUE) %>%
    unname() %>%
    round(digits = 1)

Now we build the vector for a manual gradient color scheme and the resolution for the animation.

# color palette (you can also pick colors from a predefined
# RColorBrewer or ggsci palette)
gradient <- c("#B1C055", "#6CC682", "#39C2B6", "#6BB4D7", "#B69DD1",
    "#E685A8", "#ED7F72")

And now… create the plot!

library(transformr)
measles_plot <- plot_usmap(data=measles, color="#262626", size=.3, values="incidence_rounded") +
    theme_void() +
    scale_fill_gradientn(colors = gradient, trans="pseudo_log", 
                         #The pseudo_log allows for log transformation even though 0 is in the dataset
                         na.value="grey",limits=c(min(breaks), max(breaks)), breaks=breaks[c(1,3:5)], labels=breaks[c(1,3:5)]) +
    #only including min, 50th, 75th and max values  
    guides(fill = guide_colorbar(title="",
                                 frame.colour = "black",
                                 label.position="top",
                                 barwidth = 8,
                                 barheight = 1, 
                                 ticks=FALSE,
                                 keywidth=15,
                                 label.hjust = 0.5,
                                 label.vjust = 0.3,
                                 label.theme = element_text(angle = 45, size=10)))+
    labs(title = "Measles Incidence per 100,000 in {frame_time}",
         subtitle="{unique(measles$vaccine[measles$year==(frame_time)])}")+ # allows for vaccine column to display as a subtitle relative to plot animation
    theme(legend.position="bottom", legend.justification=c(.8,0),
          plot.title=element_text(face="bold", size=14, color="#262626",hjust=.5),
          plot.subtitle=element_text(hjust=.5))+
    transition_time(year)
anim <- animate(measles_plot, width = 700, height = 432, nframes = 76,
    fps = 2, renderer = ffmpeg_renderer(format = "webm"))
anim